package com.zilbo.flamingSailor.TE;
import com.zilbo.flamingSailor.TE.model.Component;
import com.zilbo.flamingSailor.TE.model.PDLink;
import com.zilbo.flamingSailor.TE.model.TextPage;
import com.zilbo.flamingSailor.TE.model.TextPiece;
import org.apache.log4j.Logger;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.awt.color.ColorSpace;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.awt.image.ColorConvertOp;
import java.awt.image.ColorModel;
import java.awt.image.DataBuffer;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/*
* Copyright 2012 Zilbo.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A {@link PDFTextStripper} that collects every glyph of a PDF into {@link TextPiece}s,
 * groups them per page into {@link TextPage}s and gathers document-wide statistics
 * (font/size frequencies, average line geometry, character density) which are then
 * handed to each page so it can construct its higher-order components.
 *
 * <p>An instance keeps per-document state in fields, so it must not be used by several
 * threads at once. The state is reset at the start of every {@link #getTextPages} call.
 */
public class PDFParser extends PDFTextStripper {
    private static final Logger logger = Logger.getLogger(PDFParser.class);

    /** Font name recorded for glyphs whose font, descriptor or font name is missing. */
    private static final String UNKNOWN_FONT = "UNKNOWN";
    /** Font sizes / y-scales below this value are considered "not in the normal range". */
    private static final double TINY_SIZE = 0.3;
    /** Factor used to bring such tiny sizes back into the normal range. */
    private static final int TINY_SIZE_FACTOR = 100;

    /** Pages collected so far for the document currently being parsed. */
    private List<TextPage> textPageList;
    /** 1-based number of the page currently being processed. */
    private int m_currentPageNo = 0;
    /** Minimum height handed to every {@link TextPage} that is created. */
    private float minHeight;
    TextPage currentPage;
    PDDocument document;
    /** font name -> (rounded height -> share of all counted glyphs). */
    Map<String, Map<Integer, Double>> normalizedFontCounts;
    /** rounded height -> share of all counted glyphs, summed over all fonts. */
    Map<Integer, Double> normalizedSizes;
    /** font name -> share of all counted glyphs, summed over all sizes. */
    Map<String, Double> normalizedFonts;
    /** Rounded height of the single most frequent (font, size) combination. */
    Integer highestFreqSize;
    PDDocumentCatalog catalog;
    /** All pages of the current document; used to resolve link destinations to page numbers. */
    List allpages;
    /** Sink for the text written by the stripper; drained into each page in {@link #endPage}. */
    StringWriter outString;
    private String fileName; // for debugging purposes;
    double docAvgLeft = 0.0;
    double docAvgRight = 0.0;
    double docAvgWidth = 0.0;
    long docLineCount = 0;
    Double docCharDensity = 0.0;
    double linesPerPage = 0.0;
    Double[] normalizedHistogram = null;
    int minFontSize = 99999;
    int maxFontSize = 0;

    /**
     * Constructor
     *
     * @throws java.io.IOException if the underlying text stripper cannot be initialised
     */
    public PDFParser() throws IOException {
        super();
    }

    /**
     * Extracts the pages of a PDF, ignoring characters smaller than minHeight, and
     * constructs the components of every page from the document-wide statistics.
     *
     * <p>I/O errors while reading the file are logged, not thrown. Whatever pages were
     * collected before the error are still processed. If no page could be read at all,
     * an empty list is returned.
     *
     * @param pdfFile   the File to extract the pages out of
     * @param minHeight minimum height to ignore
     * @return the pages of the document, in reading order; never {@code null}
     */
    public List<TextPage> getTextPages(File pdfFile, float minHeight) {
        fileName = pdfFile.getName();
        this.minHeight = minHeight;
        // Start from a clean slate so that a parser instance can be reused for several files
        // without mixing page numbers or averages of different documents.
        resetDocumentState();
        this.textPageList = new ArrayList<>();
        outString = new StringWriter();

        stripDocument(pdfFile);

        Map<String, Map<Integer, Long>> fontCounts = new HashMap<>();
        if (textPageList.isEmpty()) {
            // Nothing could be read (e.g. the file failed to load). Leave the statistics at
            // their neutral values instead of dividing by a page count of zero.
            normalizeFontCounts(fontCounts);
            return textPageList;
        }

        // the page is currently a set of lines with text pieces.
        // next steps
        // 1. remove header/footer boilerplate
        // 2. get font stats
        // 3. construct higher order components
        //
        TextPage.removeBoilerplate(textPageList, TextPage.LEVENSHTEIN_DISTANCE);

        long[] histogram = null;
        for (TextPage page : textPageList) {
            histogram = Component.mergeHistogram(page.getHistogram(), histogram);
            long lineCount = page.getLineCount();
            if (lineCount > 0) {
                // weight every page average by its number of lines
                docAvgLeft += page.getAvgLeft() * lineCount;
                docAvgWidth += page.getAvgWidth() * lineCount;
                docAvgRight += page.getAvgRight() * lineCount;
                docCharDensity += page.getCharDensity() * lineCount;
                docLineCount += lineCount;
            }
            mergeFontCounts(fontCounts, page.getFontCounts());
        }

        if (docLineCount > 0) {
            docAvgLeft /= docLineCount;
            docAvgRight /= docLineCount;
            docAvgWidth /= docLineCount;
            docCharDensity /= docLineCount;
        }
        // floating point division: an integer division would silently drop the fraction
        linesPerPage = (double) docLineCount / textPageList.size();

        normalizeFontCounts(fontCounts);
        normalizedHistogram = Component.getNormalizedHistogram(histogram);
        logger.info(Component.normHistoGramToString(normalizedHistogram) +
                String.format(" H:%5.1f W:%6.1f D:%4.2f P:%4.2f", (double) highestFreqSize, docAvgWidth, docCharDensity, 1.0));

        for (TextPage page : textPageList) {
            page.constructPageComponents(highestFreqSize,
                    this.minFontSize, this.maxFontSize,
                    normalizedFontCounts, normalizedFonts, normalizedSizes,
                    docAvgLeft, docAvgRight, docAvgWidth,
                    docCharDensity, linesPerPage,
                    normalizedHistogram);
        }
        return textPageList;
    }

    /** Clears everything that is accumulated while parsing one document. */
    private void resetDocumentState() {
        m_currentPageNo = 0;
        currentPage = null;
        docAvgLeft = 0.0;
        docAvgRight = 0.0;
        docAvgWidth = 0.0;
        docLineCount = 0;
        docCharDensity = 0.0;
        linesPerPage = 0.0;
        normalizedHistogram = null;
    }

    /**
     * Loads the PDF and runs the text stripper over it, which fills {@link #textPageList}
     * through the {@link #startPage}/{@link #endPage} callbacks. I/O errors are logged.
     * The document is always closed again.
     */
    private void stripDocument(File pdfFile) {
        document = null;
        try {
            document = PDDocument.load(pdfFile);
            catalog = document.getDocumentCatalog();
            allpages = catalog.getAllPages();
            this.writeText(document, outString);
        } catch (IOException e) {
            logger.error("I/O Error:" + pdfFile.getName(), e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    logger.error("I/O error closing file:" + pdfFile.getName(), e);
                }
                document = null;
            }
            outString = null;
        }
    }

    /** Adds the per-page (font -> size -> count) tallies onto the document-wide tallies. */
    private static void mergeFontCounts(Map<String, Map<Integer, Long>> totals,
                                        Map<String, Map<Integer, Long>> pageCounts) {
        for (Map.Entry<String, Map<Integer, Long>> fontEntry : pageCounts.entrySet()) {
            Map<Integer, Long> fontTally = totals.get(fontEntry.getKey());
            if (fontTally == null) {
                fontTally = new HashMap<>();
                totals.put(fontEntry.getKey(), fontTally);
            }
            for (Map.Entry<Integer, Long> sizeEntry : fontEntry.getValue().entrySet()) {
                Long tally = fontTally.get(sizeEntry.getKey());
                if (tally == null) {
                    fontTally.put(sizeEntry.getKey(), sizeEntry.getValue());
                } else {
                    fontTally.put(sizeEntry.getKey(), tally + sizeEntry.getValue());
                }
            }
        }
    }

    public int getMinFontSize() {
        return minFontSize;
    }

    public int getMaxFontSize() {
        return maxFontSize;
    }

    /**
     * Turns absolute (font -> size -> count) tallies into relative frequencies and records
     * the most frequent size as well as the smallest and largest size seen.
     *
     * <p>Fills {@link #normalizedFontCounts}, {@link #normalizedSizes} and
     * {@link #normalizedFonts}; each value is a count divided by the total number of
     * counted glyphs. Entries of the font named "UNKNOWN" are left out completely.
     *
     * @param fontCounts font name -> (rounded height -> number of glyphs)
     */
    protected void normalizeFontCounts(Map<String, Map<Integer, Long>> fontCounts) {
        this.normalizedFontCounts = new HashMap<>();
        this.normalizedSizes = new HashMap<>();
        this.normalizedFonts = new HashMap<>();
        double total = 0.0;
        long maxFreq = 0;
        highestFreqSize = 0;
        minFontSize = 999999;
        maxFontSize = 0;

        // first pass: totals, most frequent size and size range
        for (Map.Entry<String, Map<Integer, Long>> fontEntry : fontCounts.entrySet()) {
            // unknown fonts are usually used in diagrams/or other weird things. so ignore em
            // (constant first: a null font name must not blow up the comparison)
            if (UNKNOWN_FONT.equals(fontEntry.getKey())) {
                continue;
            }
            for (Map.Entry<Integer, Long> sizeEntry : fontEntry.getValue().entrySet()) {
                long count = sizeEntry.getValue();
                int size = sizeEntry.getKey();
                total += count;
                if (count > maxFreq) {
                    maxFreq = count;
                    highestFreqSize = size;
                }
                maxFontSize = Math.max(maxFontSize, size);
                minFontSize = Math.min(minFontSize, size);
            }
        }

        // second pass: relative frequencies per font/size, per size and per font
        for (Map.Entry<String, Map<Integer, Long>> fontEntry : fontCounts.entrySet()) {
            if (UNKNOWN_FONT.equals(fontEntry.getKey())) {
                continue;
            }
            Map<Integer, Double> normalizedBySize = new HashMap<>();
            double fontTotal = 0.0;
            for (Map.Entry<Integer, Long> sizeEntry : fontEntry.getValue().entrySet()) {
                Double normalized = 1.0 * sizeEntry.getValue() / total;
                fontTotal += sizeEntry.getValue();
                normalizedBySize.put(sizeEntry.getKey(), normalized);
                Double sizeScore = normalizedSizes.get(sizeEntry.getKey());
                if (sizeScore == null) {
                    sizeScore = normalized;
                } else {
                    sizeScore += normalized;
                }
                normalizedSizes.put(sizeEntry.getKey(), sizeScore);
            }
            normalizedFonts.put(fontEntry.getKey(), fontTotal / total);
            normalizedFontCounts.put(fontEntry.getKey(), normalizedBySize);
        }
    }

    /**
     * Called by the stripper before a page is processed: opens a fresh {@link TextPage}
     * carrying the next page number.
     */
    @Override
    protected void startPage(PDPage page) throws IOException {
        super.startPage(page);
        m_currentPageNo++;
        currentPage = new TextPage(m_currentPageNo, this.minHeight);
    }

    /**
     * Called by the stripper after a page has been processed: converts every collected
     * character into a {@link TextPiece}, tallies font/size usage, hands everything to the
     * current {@link TextPage}, attaches the text written for this page and the page's
     * link annotations, and finally appends the page to the result list.
     */
    @Override
    protected void endPage(PDPage page) throws IOException {
        super.endPage(page);
        int pieceID = 0;
        Map<String, Map<Integer, Long>> fontCounts = new HashMap<>();
        List<TextPiece> wordsOfThisPage = new ArrayList<>();
        for (List<TextPosition> article : charactersByArticle) {
            for (TextPosition t : article) {
                TextPiece w = toTextPiece(pieceID++, t);
                tallyFontUsage(fontCounts, w);
                wordsOfThisPage.add(w);
            }
        }
        currentPage.processPage(wordsOfThisPage, fontCounts);
        // the writer holds exactly the text of this page; empty it for the next one
        currentPage.setText(outString.getBuffer().toString());
        outString.getBuffer().setLength(0);

        collectLinks(page);
        /*
        The following code is REALLY raw.
        initial testing seemed to show memory leaks, and was REALLY slow.
        PDResources r = page.getResources();
        Map<String, PDXObjectImage> images = r.getImages();
        for (Map.Entry<String, PDXObjectImage> e : images.entrySet()) {
        BufferedImage bi = null;
        try {
        // currentPage.addImage(bi);
        // (e.getValue()).write2file("/tmp/II" + e.getKey());
        if (e.getValue() instanceof PDJpeg) {
        PDJpeg jpg = (PDJpeg) e.getValue();
        bi = jpg.getRGBImage();
        ColorSpace cs = bi.getColorModel().getColorSpace();
        File jpgFile = new File("/tmp/II" + e.getKey() + ".jpg");
        if (cs instanceof ColorSpaceCMYK) {
        logger.info("Ignoring image with CMYK color space");
        } else {
        // ImageIO.write(bi, "jpg", jpgFile);
        jpg.write2file("/tmp/II"+ e.getKey());
        }
        } else {
        (e.getValue()).write2file("/tmp/II" + e.getKey());
        }
        } catch (Exception ee) {
        logger.info("can't read image ;-(", ee);
        }
        }
        */
        textPageList.add(currentPage);
        currentPage = null;
    }

    /** Copies font, size, geometry and text of one stripped character into a new piece. */
    private static TextPiece toTextPiece(int id, TextPosition t) {
        TextPiece w = new TextPiece(id);
        w.setFontName(fontNameOf(t));
        applySizes(w, t);
        w.setWidth(Math.abs(t.getWidth()));
        w.setGeom(t.getX(), t.getY(), w.getWidth(), w.getHeight());
        w.setText(t.getCharacter());
        w.setWidthOfSpace(t.getWidthOfSpace());
        return w;
    }

    /** Returns the descriptor's font name, or "UNKNOWN" when font, descriptor or name is missing. */
    private static String fontNameOf(TextPosition t) {
        PDFont font = t.getFont();
        PDFontDescriptor descriptor = (font == null) ? null : font.getFontDescriptor();
        String name = (descriptor == null) ? null : descriptor.getFontName();
        return (name == null) ? UNKNOWN_FONT : name;
    }

    /**
     * Sets font size, height and x/y scale of the piece.
     *
     * <p>100: a simple step to fix the font size to the normal range, for those documents
     * in unknown codes that PDFBox can not process now.
     */
    private static void applySizes(TextPiece w, TextPosition t) {
        float fontSize = t.getFontSize();
        float xScale = t.getXScale();
        float yScale = t.getYScale();
        if (fontSize < TINY_SIZE && yScale <= 1.0) {
            // tiny font size: scale the size (and the height derived from it)
            w.setFontSize(fontSize * TINY_SIZE_FACTOR);
            w.setHeight(Math.max(yScale, fontSize) * TINY_SIZE_FACTOR);
            w.setXScale(xScale);
            w.setYScale(yScale);
        } else if (yScale < TINY_SIZE && fontSize <= 1.0) {
            // tiny scale: scale x/y; the font size itself is kept as reported.
            // It has to be set explicitly, otherwise the piece is left without a font size.
            w.setFontSize(fontSize);
            w.setYScale(yScale * TINY_SIZE_FACTOR);
            w.setXScale(xScale * TINY_SIZE_FACTOR);
            w.setHeight(Math.max(yScale * TINY_SIZE_FACTOR, fontSize));
        } else {
            w.setFontSize(fontSize);
            w.setHeight(Math.max(yScale, fontSize));
            w.setXScale(xScale);
            w.setYScale(yScale);
        }
    }

    /** Increments the (font name -> rounded height) counter for the given piece. */
    private static void tallyFontUsage(Map<String, Map<Integer, Long>> fontCounts, TextPiece w) {
        Map<Integer, Long> counts = fontCounts.get(w.getFontName());
        if (counts == null) {
            counts = new HashMap<>();
            fontCounts.put(w.getFontName(), counts);
        }
        int sizeKey = (int) Math.round(w.getHeight());
        Long count = counts.get(sizeKey);
        counts.put(sizeKey, count == null ? 1L : count + 1L);
    }

    /**
     * Adds a {@link PDLink} to the current page for every link annotation that has a
     * rectangle and an XYZ destination whose page number is negative; the target page is
     * then looked up by its position in {@link #allpages}.
     */
    private void collectLinks(PDPage page) throws IOException {
        List<PDAnnotation> annotations = page.getAnnotations();
        for (PDAnnotation annotation : annotations) {
            if (!(annotation instanceof PDAnnotationLink)) {
                continue;
            }
            PDAnnotationLink link = (PDAnnotationLink) annotation;
            PDRectangle rect = link.getRectangle();
            PDDestination dest = link.getDestination();
            if (!(dest instanceof PDPageXYZDestination) || rect == null) {
                continue;
            }
            PDPageXYZDestination xyzDestination = (PDPageXYZDestination) dest;
            if (xyzDestination.getPageNumber() < 0) {
                PDPage pageDest = xyzDestination.getPage();
                // 1-based page number of the target (0 when the page is not in the list)
                int pageNumber = allpages.indexOf(pageDest) + 1;
                Rectangle2D hotbox = new Rectangle2D.Double(rect.getLowerLeftX(), rect.getLowerLeftY(),
                        (rect.getUpperRightX() - rect.getLowerLeftX()), (rect.getUpperRightY() - rect.getLowerLeftY()));
                Point2D toPoint = new Point2D.Double(xyzDestination.getLeft(), xyzDestination.getTop());
                currentPage.addLink(new PDLink(hotbox, pageNumber, toPoint));
            }
        }
    }

    public double getDocAvgLeft() {
        return docAvgLeft;
    }

    public double getDocAvgWidth() {
        return docAvgWidth;
    }

    public double getDocAvgRight() {
        return docAvgRight;
    }

    public long getDocLineCount() {
        return docLineCount;
    }

    public Double getDocCharDensity() {
        return docCharDensity;
    }

    public double getLinesPerPage() {
        return linesPerPage;
    }

    public Map<String, Map<Integer, Double>> getNormalizedFontCounts() {
        return normalizedFontCounts;
    }

    public Map<Integer, Double> getNormalizedSizes() {
        return normalizedSizes;
    }

    public Map<String, Double> getNormalizedFonts() {
        return normalizedFonts;
    }

    // Debugging aid: the block below is disabled. Changing the opening marker on the next
    // line from "/* /" to "/* */" enables an operator-level trace of the content stream.
    /* /
    @Override
    protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
    switch (operator.getOperation()) {
    case "w":
    logger.info("Width:" + arguments.get(0));
    break;
    case "l":
    logger.info("line To: (" + arguments.get(0) + "," + arguments.get(1) + ")");
    break;
    case "m":
    logger.info("move To: (" + arguments.get(0) + "," + arguments.get(1) + ")");
    break;
    case "h":
    logger.info("close path");
    break;
    case "s":
    logger.info("close/stroke");
    break;
    case "f":
    case "F":
    case "f*":
    case "B":
    case "B*":
    case "b":
    case "b*":
    logger.info("fill path");
    break;
    case "n":
    logger.info("no-op path (changes clipping path)");
    break;
    case "W":
    logger.info("set clipping path");
    break;
    case "CS":
    // logger.info("stroking color space:" + arguments.get(0));
    break;
    case "cs":
    // logger.info("non-stroking color space:" + arguments.get(0));
    break;
    case "SC":
    // logger.info("stroking color:" + arguments.get(0));
    break;
    case "sc":
    // logger.info("non-stroking color :" + arguments.get(0));
    break;
    case "G":
    case "RG":
    case "K":
    // logger.info("stroking color:" + operator.getOperation() + ":" + arguments);
    break;
    case "g":
    case "rg":
    case "k":
    // logger.info("non-stroking color:" + operator.getOperation() + ":" + arguments);
    break;
    case "S":
    logger.info("stroke");
    break;
    case "c":
    logger.info("curve To: (" + arguments.get(4) + "," + arguments.get(5) + ") - via (" +
    arguments.get(0) + "," + arguments.get(1) + ") (" +
    arguments.get(2) + "," + arguments.get(3) + ")");
    break;
    case "v":
    logger.info("curve To: (" + arguments.get(2) + "," + arguments.get(3) + ") - via (" +
    arguments.get(0) + "," + arguments.get(1) + ")");
    break;
    case "re":
    logger.info("rectangle: (" + arguments.get(0) + "," + arguments.get(1) + ") - w (" +
    arguments.get(2) + " h" + arguments.get(3));
    break;
    case "q": // push graphic state
    case "Q": // pop graphic state
    case "GS":
    case "gs":
    break;
    // text commands
    case "Tj": //show a text line
    case "TJ": //show a text line
    logger.info(operator.getOperation() + "\t" + arguments);
    // logger.info("Text:" + arguments);
    break;
    case "Ts": // rise (super/subscript)
    logger.info(operator.getOperation() + "\t" + arguments);
    break;
    case "BT": // begin text
    case "ET": // end text
    case "Tc": // charspace
    case "Tw": // wordspace
    case "Tz": // scale
    case "Tf": // fontsize
    case "TL": // Text leading (vertical distance between baselines of adjacent lines
    case "Tr": // render
    case "Td": // start of next line
    case "TD": // start of next line
    case "Tm": // text matrix
    case "T*": // move to start of line
    logger.info(operator.getOperation() + "\t" + arguments);
    break;
    default:
    logger.info(operator.getOperation() + "\t" + arguments);
    }
    super.processOperator(operator, arguments);
    }
    /**/
}